kreuzberg 3.7.0__py3-none-any.whl → 3.8.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/_entity_extraction.py +1 -2
- kreuzberg/_extractors/_base.py +39 -1
- kreuzberg/_extractors/_email.py +149 -0
- kreuzberg/_extractors/_html.py +15 -3
- kreuzberg/_extractors/_image.py +21 -36
- kreuzberg/_extractors/_pandoc.py +3 -14
- kreuzberg/_extractors/_pdf.py +81 -48
- kreuzberg/_extractors/_presentation.py +62 -10
- kreuzberg/_extractors/_spread_sheet.py +179 -4
- kreuzberg/_extractors/_structured.py +148 -0
- kreuzberg/_gmft.py +314 -7
- kreuzberg/_mime_types.py +27 -1
- kreuzberg/_ocr/__init__.py +10 -1
- kreuzberg/_ocr/_base.py +59 -0
- kreuzberg/_ocr/_easyocr.py +91 -0
- kreuzberg/_ocr/_paddleocr.py +89 -0
- kreuzberg/_ocr/_tesseract.py +564 -4
- kreuzberg/_registry.py +4 -0
- kreuzberg/_types.py +131 -0
- kreuzberg/_utils/_cache.py +52 -4
- kreuzberg/_utils/_errors.py +3 -7
- kreuzberg/_utils/_process_pool.py +180 -7
- kreuzberg/_utils/_quality.py +237 -0
- kreuzberg/_utils/_serialization.py +4 -2
- kreuzberg/_utils/_string.py +153 -10
- kreuzberg/_utils/_sync.py +5 -2
- kreuzberg/_utils/_table.py +261 -0
- kreuzberg/cli.py +1 -2
- kreuzberg/extraction.py +4 -22
- {kreuzberg-3.7.0.dist-info → kreuzberg-3.8.1.dist-info}/METADATA +58 -54
- kreuzberg-3.8.1.dist-info/RECORD +53 -0
- kreuzberg/_multiprocessing/__init__.py +0 -6
- kreuzberg/_multiprocessing/gmft_isolated.py +0 -330
- kreuzberg/_multiprocessing/process_manager.py +0 -189
- kreuzberg/_multiprocessing/sync_easyocr.py +0 -235
- kreuzberg/_multiprocessing/sync_paddleocr.py +0 -199
- kreuzberg/_multiprocessing/sync_tesseract.py +0 -261
- kreuzberg/_multiprocessing/tesseract_pool.py +0 -359
- kreuzberg-3.7.0.dist-info/RECORD +0 -56
- {kreuzberg-3.7.0.dist-info → kreuzberg-3.8.1.dist-info}/WHEEL +0 -0
- {kreuzberg-3.7.0.dist-info → kreuzberg-3.8.1.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.7.0.dist-info → kreuzberg-3.8.1.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_entity_extraction.py
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
|
+
import os
|
3
4
|
import re
|
4
5
|
from dataclasses import dataclass
|
5
6
|
from functools import lru_cache
|
@@ -181,8 +182,6 @@ def _load_spacy_model(model_name: str, spacy_config: SpacyEntityExtractionConfig
|
|
181
182
|
import spacy
|
182
183
|
|
183
184
|
if spacy_config.model_cache_dir:
|
184
|
-
import os
|
185
|
-
|
186
185
|
os.environ["SPACY_DATA"] = str(spacy_config.model_cache_dir)
|
187
186
|
|
188
187
|
nlp = spacy.load(model_name)
|
kreuzberg/_extractors/_base.py
CHANGED
@@ -3,10 +3,12 @@ from __future__ import annotations
|
|
3
3
|
from abc import ABC, abstractmethod
|
4
4
|
from typing import TYPE_CHECKING, ClassVar
|
5
5
|
|
6
|
+
from kreuzberg._types import ExtractionResult, normalize_metadata
|
7
|
+
from kreuzberg._utils._quality import calculate_quality_score, clean_extracted_text
|
8
|
+
|
6
9
|
if TYPE_CHECKING:
|
7
10
|
from pathlib import Path
|
8
11
|
|
9
|
-
from kreuzberg import ExtractionResult
|
10
12
|
from kreuzberg._types import ExtractionConfig
|
11
13
|
|
12
14
|
|
@@ -90,3 +92,39 @@ class Extractor(ABC):
|
|
90
92
|
return mime_type in cls.SUPPORTED_MIME_TYPES or any(
|
91
93
|
mime_type.startswith(supported_type) for supported_type in cls.SUPPORTED_MIME_TYPES
|
92
94
|
)
|
95
|
+
|
96
|
+
def _apply_quality_processing(self, result: ExtractionResult) -> ExtractionResult:
|
97
|
+
"""Apply quality post-processing to extraction result if enabled.
|
98
|
+
|
99
|
+
Args:
|
100
|
+
result: The raw extraction result
|
101
|
+
|
102
|
+
Returns:
|
103
|
+
Enhanced extraction result with quality improvements (if enabled)
|
104
|
+
"""
|
105
|
+
# Only apply quality processing if enabled in config
|
106
|
+
if not self.config.enable_quality_processing:
|
107
|
+
return result
|
108
|
+
|
109
|
+
if not result.content:
|
110
|
+
return result
|
111
|
+
|
112
|
+
# Clean the content
|
113
|
+
cleaned_content = clean_extracted_text(result.content)
|
114
|
+
|
115
|
+
# Calculate quality score
|
116
|
+
quality_score = calculate_quality_score(cleaned_content, dict(result.metadata) if result.metadata else None)
|
117
|
+
|
118
|
+
# Add quality metadata
|
119
|
+
enhanced_metadata = dict(result.metadata) if result.metadata else {}
|
120
|
+
enhanced_metadata["quality_score"] = quality_score
|
121
|
+
|
122
|
+
# Return enhanced result
|
123
|
+
return ExtractionResult(
|
124
|
+
content=cleaned_content,
|
125
|
+
mime_type=result.mime_type,
|
126
|
+
metadata=normalize_metadata(enhanced_metadata),
|
127
|
+
chunks=result.chunks,
|
128
|
+
detected_languages=result.detected_languages,
|
129
|
+
tables=result.tables,
|
130
|
+
)
|
kreuzberg/_extractors/_email.py
ADDED
@@ -0,0 +1,149 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
import re
|
4
|
+
from html import unescape
|
5
|
+
from typing import TYPE_CHECKING, Any, ClassVar
|
6
|
+
|
7
|
+
from anyio import Path as AsyncPath
|
8
|
+
|
9
|
+
from kreuzberg._extractors._base import Extractor
|
10
|
+
from kreuzberg._mime_types import EML_MIME_TYPE, PLAIN_TEXT_MIME_TYPE
|
11
|
+
from kreuzberg._types import ExtractionResult, normalize_metadata
|
12
|
+
from kreuzberg._utils._string import normalize_spaces
|
13
|
+
from kreuzberg._utils._sync import run_sync
|
14
|
+
from kreuzberg.exceptions import MissingDependencyError
|
15
|
+
|
16
|
+
if TYPE_CHECKING:
|
17
|
+
from pathlib import Path
|
18
|
+
|
19
|
+
# Import optional dependencies at module level with proper error handling
|
20
|
+
try:
|
21
|
+
import mailparse
|
22
|
+
except ImportError:
|
23
|
+
mailparse = None
|
24
|
+
|
25
|
+
try:
|
26
|
+
import html2text # type: ignore[import-not-found]
|
27
|
+
except ImportError:
|
28
|
+
html2text = None
|
29
|
+
|
30
|
+
# Compile regex pattern once at module level
|
31
|
+
_HTML_TAG_PATTERN = re.compile(r"<[^>]+>")
|
32
|
+
|
33
|
+
|
34
|
+
class EmailExtractor(Extractor):
|
35
|
+
SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {EML_MIME_TYPE}
|
36
|
+
|
37
|
+
async def extract_bytes_async(self, content: bytes) -> ExtractionResult:
|
38
|
+
return await run_sync(self.extract_bytes_sync, content)
|
39
|
+
|
40
|
+
async def extract_path_async(self, path: Path) -> ExtractionResult:
|
41
|
+
content = await AsyncPath(path).read_bytes()
|
42
|
+
return await self.extract_bytes_async(content)
|
43
|
+
|
44
|
+
def _extract_email_headers(
|
45
|
+
self, parsed_email: dict[str, Any], text_parts: list[str], metadata: dict[str, Any]
|
46
|
+
) -> None:
|
47
|
+
"""Extract and process email headers."""
|
48
|
+
# Use single dict access where possible to avoid repeated lookups
|
49
|
+
subject = parsed_email.get("subject")
|
50
|
+
if subject:
|
51
|
+
metadata["subject"] = subject
|
52
|
+
text_parts.append(f"Subject: {subject}")
|
53
|
+
|
54
|
+
from_info = parsed_email.get("from")
|
55
|
+
if from_info:
|
56
|
+
from_email = from_info.get("email", "") if isinstance(from_info, dict) else str(from_info)
|
57
|
+
metadata["email_from"] = from_email
|
58
|
+
text_parts.append(f"From: {from_email}")
|
59
|
+
|
60
|
+
to_info = parsed_email.get("to")
|
61
|
+
if to_info:
|
62
|
+
if isinstance(to_info, list) and to_info:
|
63
|
+
to_email = to_info[0].get("email", "") if isinstance(to_info[0], dict) else str(to_info[0])
|
64
|
+
elif isinstance(to_info, dict):
|
65
|
+
to_email = to_info.get("email", "")
|
66
|
+
else:
|
67
|
+
to_email = str(to_info)
|
68
|
+
metadata["email_to"] = to_email
|
69
|
+
text_parts.append(f"To: {to_email}")
|
70
|
+
|
71
|
+
date = parsed_email.get("date")
|
72
|
+
if date:
|
73
|
+
metadata["date"] = date
|
74
|
+
text_parts.append(f"Date: {date}")
|
75
|
+
|
76
|
+
cc = parsed_email.get("cc")
|
77
|
+
if cc:
|
78
|
+
metadata["email_cc"] = cc
|
79
|
+
text_parts.append(f"CC: {cc}")
|
80
|
+
|
81
|
+
bcc = parsed_email.get("bcc")
|
82
|
+
if bcc:
|
83
|
+
metadata["email_bcc"] = bcc
|
84
|
+
text_parts.append(f"BCC: {bcc}")
|
85
|
+
|
86
|
+
def _extract_email_body(self, parsed_email: dict[str, Any], text_parts: list[str]) -> None:
|
87
|
+
"""Extract and process email body content."""
|
88
|
+
text_content = parsed_email.get("text")
|
89
|
+
if text_content:
|
90
|
+
text_parts.append(f"\n{text_content}")
|
91
|
+
return # If we have text, prefer it over HTML
|
92
|
+
|
93
|
+
html_content = parsed_email.get("html")
|
94
|
+
if html_content:
|
95
|
+
if html2text is not None:
|
96
|
+
# Use html2text if available (faster path)
|
97
|
+
h = html2text.HTML2Text()
|
98
|
+
h.ignore_links = True
|
99
|
+
h.ignore_images = True
|
100
|
+
converted_text = h.handle(html_content)
|
101
|
+
text_parts.append(f"\n{converted_text}")
|
102
|
+
else:
|
103
|
+
# Fallback: strip HTML tags and unescape entities
|
104
|
+
clean_html = _HTML_TAG_PATTERN.sub("", html_content)
|
105
|
+
clean_html = unescape(clean_html)
|
106
|
+
text_parts.append(f"\n{clean_html}")
|
107
|
+
|
108
|
+
def _extract_email_attachments(
|
109
|
+
self, parsed_email: dict[str, Any], text_parts: list[str], metadata: dict[str, Any]
|
110
|
+
) -> None:
|
111
|
+
"""Extract and process email attachments info."""
|
112
|
+
if parsed_email.get("attachments"):
|
113
|
+
attachment_names = [att.get("name", "unknown") for att in parsed_email["attachments"]]
|
114
|
+
metadata["attachments"] = attachment_names
|
115
|
+
if attachment_names:
|
116
|
+
text_parts.append(f"\nAttachments: {', '.join(attachment_names)}")
|
117
|
+
|
118
|
+
def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
|
119
|
+
if mailparse is None:
|
120
|
+
msg = "mailparse is required for email extraction. Install with: pip install 'kreuzberg[additional-extensions]'"
|
121
|
+
raise MissingDependencyError(msg)
|
122
|
+
|
123
|
+
try:
|
124
|
+
parsed_email = mailparse.EmailDecode.load(content)
|
125
|
+
text_parts: list[str] = []
|
126
|
+
metadata: dict[str, Any] = {}
|
127
|
+
|
128
|
+
# Extract headers, body, and attachments
|
129
|
+
self._extract_email_headers(parsed_email, text_parts, metadata)
|
130
|
+
self._extract_email_body(parsed_email, text_parts)
|
131
|
+
self._extract_email_attachments(parsed_email, text_parts, metadata)
|
132
|
+
|
133
|
+
# Join efficiently
|
134
|
+
combined_text = "\n".join(text_parts)
|
135
|
+
|
136
|
+
return ExtractionResult(
|
137
|
+
content=normalize_spaces(combined_text),
|
138
|
+
mime_type=PLAIN_TEXT_MIME_TYPE,
|
139
|
+
metadata=normalize_metadata(metadata),
|
140
|
+
chunks=[],
|
141
|
+
)
|
142
|
+
|
143
|
+
except Exception as e:
|
144
|
+
msg = f"Failed to parse email content: {e}"
|
145
|
+
raise RuntimeError(msg) from e
|
146
|
+
|
147
|
+
def extract_path_sync(self, path: Path) -> ExtractionResult:
|
148
|
+
content = path.read_bytes()
|
149
|
+
return self.extract_bytes_sync(content)
|
kreuzberg/_extractors/_html.py
CHANGED
@@ -8,7 +8,7 @@ from anyio import Path as AsyncPath
|
|
8
8
|
from kreuzberg._extractors._base import Extractor
|
9
9
|
from kreuzberg._mime_types import HTML_MIME_TYPE, MARKDOWN_MIME_TYPE
|
10
10
|
from kreuzberg._types import ExtractionResult
|
11
|
-
from kreuzberg._utils._string import
|
11
|
+
from kreuzberg._utils._string import safe_decode
|
12
12
|
from kreuzberg._utils._sync import run_sync
|
13
13
|
|
14
14
|
if TYPE_CHECKING:
|
@@ -26,8 +26,20 @@ class HTMLExtractor(Extractor):
|
|
26
26
|
return await run_sync(self.extract_bytes_sync, content)
|
27
27
|
|
28
28
|
def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
|
29
|
-
|
30
|
-
|
29
|
+
# Use html-to-markdown with script/nav removal for better quality
|
30
|
+
result = html_to_markdown.convert_to_markdown(
|
31
|
+
safe_decode(content),
|
32
|
+
preprocess_html=True,
|
33
|
+
preprocessing_preset="aggressive",
|
34
|
+
remove_navigation=True,
|
35
|
+
remove_forms=True,
|
36
|
+
)
|
37
|
+
|
38
|
+
# Skip normalize_spaces since quality processing will handle whitespace
|
39
|
+
extraction_result = ExtractionResult(content=result, mime_type=MARKDOWN_MIME_TYPE, metadata={}, chunks=[])
|
40
|
+
|
41
|
+
# Apply quality processing which includes normalization
|
42
|
+
return self._apply_quality_processing(extraction_result)
|
31
43
|
|
32
44
|
def extract_path_sync(self, path: Path) -> ExtractionResult:
|
33
45
|
content = path.read_bytes()
|
kreuzberg/_extractors/_image.py
CHANGED
@@ -1,5 +1,9 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
|
+
import contextlib
|
4
|
+
import os
|
5
|
+
import tempfile
|
6
|
+
from pathlib import Path
|
3
7
|
from typing import TYPE_CHECKING, ClassVar
|
4
8
|
|
5
9
|
from anyio import Path as AsyncPath
|
@@ -7,6 +11,9 @@ from anyio import Path as AsyncPath
|
|
7
11
|
from kreuzberg._extractors._base import Extractor
|
8
12
|
from kreuzberg._mime_types import IMAGE_MIME_TYPES
|
9
13
|
from kreuzberg._ocr import get_ocr_backend
|
14
|
+
from kreuzberg._ocr._easyocr import EasyOCRConfig
|
15
|
+
from kreuzberg._ocr._paddleocr import PaddleOCRConfig
|
16
|
+
from kreuzberg._ocr._tesseract import TesseractConfig
|
10
17
|
from kreuzberg._utils._tmp import create_temp_file
|
11
18
|
from kreuzberg.exceptions import ValidationError
|
12
19
|
|
@@ -15,9 +22,6 @@ if TYPE_CHECKING: # pragma: no cover
|
|
15
22
|
|
16
23
|
from kreuzberg._types import ExtractionResult
|
17
24
|
|
18
|
-
import contextlib
|
19
|
-
from pathlib import Path
|
20
|
-
|
21
25
|
|
22
26
|
class ImageExtractor(Extractor):
|
23
27
|
SUPPORTED_MIME_TYPES: ClassVar[set[str]] = IMAGE_MIME_TYPES
|
@@ -56,13 +60,11 @@ class ImageExtractor(Extractor):
|
|
56
60
|
if self.config.ocr_backend is None:
|
57
61
|
raise ValidationError("ocr_backend is None, cannot perform OCR")
|
58
62
|
|
59
|
-
|
63
|
+
result = await get_ocr_backend(self.config.ocr_backend).process_file(path, **self.config.get_config_dict())
|
64
|
+
return self._apply_quality_processing(result)
|
60
65
|
|
61
66
|
def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
|
62
67
|
"""Pure sync implementation of extract_bytes."""
|
63
|
-
import os
|
64
|
-
import tempfile
|
65
|
-
|
66
68
|
extension = self._get_extension_from_mime_type(self.mime_type)
|
67
69
|
fd, temp_path = tempfile.mkstemp(suffix=f".{extension}")
|
68
70
|
|
@@ -80,43 +82,26 @@ class ImageExtractor(Extractor):
|
|
80
82
|
if self.config.ocr_backend is None:
|
81
83
|
raise ValidationError("ocr_backend is None, cannot perform OCR")
|
82
84
|
|
83
|
-
|
85
|
+
backend = get_ocr_backend(self.config.ocr_backend)
|
84
86
|
|
85
87
|
if self.config.ocr_backend == "tesseract":
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
else:
|
92
|
-
config = TesseractConfig()
|
93
|
-
|
94
|
-
results = process_batch_images_sync_pure([str(path)], config)
|
95
|
-
if results:
|
96
|
-
return results[0]
|
97
|
-
return ExtractionResult(content="", mime_type="text/plain", metadata={}, chunks=[])
|
98
|
-
|
99
|
-
if self.config.ocr_backend == "paddleocr":
|
100
|
-
from kreuzberg._multiprocessing.sync_paddleocr import process_image_sync_pure as paddle_process
|
101
|
-
from kreuzberg._ocr._paddleocr import PaddleOCRConfig
|
102
|
-
|
88
|
+
config = (
|
89
|
+
self.config.ocr_config if isinstance(self.config.ocr_config, TesseractConfig) else TesseractConfig()
|
90
|
+
)
|
91
|
+
result = backend.process_file_sync(path, **config.__dict__)
|
92
|
+
elif self.config.ocr_backend == "paddleocr":
|
103
93
|
paddle_config = (
|
104
94
|
self.config.ocr_config if isinstance(self.config.ocr_config, PaddleOCRConfig) else PaddleOCRConfig()
|
105
95
|
)
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
if self.config.ocr_backend == "easyocr":
|
110
|
-
from kreuzberg._multiprocessing.sync_easyocr import process_image_sync_pure as easy_process
|
111
|
-
from kreuzberg._ocr._easyocr import EasyOCRConfig
|
112
|
-
|
96
|
+
result = backend.process_file_sync(path, **paddle_config.__dict__)
|
97
|
+
elif self.config.ocr_backend == "easyocr":
|
113
98
|
easy_config = (
|
114
99
|
self.config.ocr_config if isinstance(self.config.ocr_config, EasyOCRConfig) else EasyOCRConfig()
|
115
100
|
)
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
101
|
+
result = backend.process_file_sync(path, **easy_config.__dict__)
|
102
|
+
else:
|
103
|
+
raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
|
104
|
+
return self._apply_quality_processing(result)
|
120
105
|
|
121
106
|
def _get_extension_from_mime_type(self, mime_type: str) -> str:
|
122
107
|
if mime_type in self.IMAGE_MIME_TYPE_EXT_MAP:
|
kreuzberg/_extractors/_pandoc.py
CHANGED
@@ -1,8 +1,11 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
3
|
import contextlib
|
4
|
+
import os
|
4
5
|
import re
|
6
|
+
import subprocess
|
5
7
|
import sys
|
8
|
+
import tempfile
|
6
9
|
from json import JSONDecodeError, loads
|
7
10
|
from pathlib import Path
|
8
11
|
from typing import TYPE_CHECKING, Any, ClassVar, Final, Literal, cast
|
@@ -203,10 +206,6 @@ class PandocExtractor(Extractor):
|
|
203
206
|
Returns:
|
204
207
|
ExtractionResult with the extracted text and metadata.
|
205
208
|
"""
|
206
|
-
import os
|
207
|
-
import tempfile
|
208
|
-
from pathlib import Path
|
209
|
-
|
210
209
|
extension = self._get_pandoc_type_from_mime_type(self.mime_type)
|
211
210
|
fd, temp_path = tempfile.mkstemp(suffix=f".{extension}")
|
212
211
|
|
@@ -579,8 +578,6 @@ class PandocExtractor(Extractor):
|
|
579
578
|
|
580
579
|
def _validate_pandoc_version_sync(self) -> None:
|
581
580
|
"""Synchronous version of _validate_pandoc_version."""
|
582
|
-
import subprocess
|
583
|
-
|
584
581
|
try:
|
585
582
|
if self._checked_version:
|
586
583
|
return
|
@@ -625,10 +622,6 @@ class PandocExtractor(Extractor):
|
|
625
622
|
|
626
623
|
def _extract_metadata_sync(self, path: Path) -> Metadata:
|
627
624
|
"""Synchronous version of _handle_extract_metadata."""
|
628
|
-
import os
|
629
|
-
import subprocess
|
630
|
-
import tempfile
|
631
|
-
|
632
625
|
pandoc_type = self._get_pandoc_type_from_mime_type(self.mime_type)
|
633
626
|
fd, metadata_file = tempfile.mkstemp(suffix=".json")
|
634
627
|
os.close(fd)
|
@@ -663,10 +656,6 @@ class PandocExtractor(Extractor):
|
|
663
656
|
|
664
657
|
def _extract_file_sync(self, path: Path) -> str:
|
665
658
|
"""Synchronous version of _handle_extract_file."""
|
666
|
-
import os
|
667
|
-
import subprocess
|
668
|
-
import tempfile
|
669
|
-
|
670
659
|
pandoc_type = self._get_pandoc_type_from_mime_type(self.mime_type)
|
671
660
|
fd, output_path = tempfile.mkstemp(suffix=".md")
|
672
661
|
os.close(fd)
|
kreuzberg/_extractors/_pdf.py
CHANGED
@@ -1,6 +1,8 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
3
|
import contextlib
|
4
|
+
import os
|
5
|
+
import tempfile
|
4
6
|
from multiprocessing import cpu_count
|
5
7
|
from pathlib import Path
|
6
8
|
from re import Pattern
|
@@ -10,15 +12,21 @@ from typing import TYPE_CHECKING, ClassVar, cast
|
|
10
12
|
import anyio
|
11
13
|
import pypdfium2
|
12
14
|
from anyio import Path as AsyncPath
|
15
|
+
from playa import parse
|
13
16
|
|
14
17
|
from kreuzberg._extractors._base import Extractor
|
15
18
|
from kreuzberg._mime_types import PDF_MIME_TYPE, PLAIN_TEXT_MIME_TYPE
|
16
19
|
from kreuzberg._ocr import get_ocr_backend
|
17
|
-
from kreuzberg.
|
20
|
+
from kreuzberg._ocr._easyocr import EasyOCRConfig
|
21
|
+
from kreuzberg._ocr._paddleocr import PaddleOCRConfig
|
22
|
+
from kreuzberg._ocr._tesseract import TesseractConfig
|
23
|
+
from kreuzberg._playa import extract_pdf_metadata, extract_pdf_metadata_sync
|
18
24
|
from kreuzberg._types import ExtractionResult, OcrBackendType
|
25
|
+
from kreuzberg._utils._errors import create_error_context, should_retry
|
19
26
|
from kreuzberg._utils._pdf_lock import pypdfium_file_lock
|
20
27
|
from kreuzberg._utils._string import normalize_spaces
|
21
28
|
from kreuzberg._utils._sync import run_sync, run_taskgroup_batched
|
29
|
+
from kreuzberg._utils._table import generate_table_summary
|
22
30
|
from kreuzberg._utils._tmp import create_temp_file
|
23
31
|
from kreuzberg.exceptions import ParsingError
|
24
32
|
|
@@ -63,17 +71,30 @@ class PDFExtractor(Extractor):
|
|
63
71
|
result.metadata = await extract_pdf_metadata(content_bytes)
|
64
72
|
|
65
73
|
if self.config.extract_tables:
|
66
|
-
|
67
|
-
|
68
|
-
|
74
|
+
# GMFT is optional dependency
|
75
|
+
try:
|
76
|
+
from kreuzberg._gmft import extract_tables
|
69
77
|
|
70
|
-
|
78
|
+
result.tables = await extract_tables(path, self.config.gmft_config)
|
79
|
+
except ImportError:
|
80
|
+
result.tables = []
|
81
|
+
|
82
|
+
# Enhance metadata with table information
|
83
|
+
if result.tables:
|
84
|
+
table_summary = generate_table_summary(result.tables)
|
85
|
+
result.metadata.update(
|
86
|
+
{
|
87
|
+
"table_count": table_summary["table_count"],
|
88
|
+
"tables_summary": f"Document contains {table_summary['table_count']} tables "
|
89
|
+
f"across {table_summary['pages_with_tables']} pages with "
|
90
|
+
f"{table_summary['total_rows']} total rows",
|
91
|
+
}
|
92
|
+
)
|
93
|
+
|
94
|
+
return self._apply_quality_processing(result)
|
71
95
|
|
72
96
|
def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
|
73
97
|
"""Pure sync implementation of PDF extraction from bytes."""
|
74
|
-
import os
|
75
|
-
import tempfile
|
76
|
-
|
77
98
|
fd, temp_path = tempfile.mkstemp(suffix=".pdf")
|
78
99
|
try:
|
79
100
|
with os.fdopen(fd, "wb") as f:
|
@@ -81,8 +102,6 @@ class PDFExtractor(Extractor):
|
|
81
102
|
|
82
103
|
result = self.extract_path_sync(Path(temp_path))
|
83
104
|
|
84
|
-
from kreuzberg._playa import extract_pdf_metadata_sync
|
85
|
-
|
86
105
|
metadata = extract_pdf_metadata_sync(content)
|
87
106
|
result.metadata = metadata
|
88
107
|
|
@@ -100,16 +119,21 @@ class PDFExtractor(Extractor):
|
|
100
119
|
|
101
120
|
tables = []
|
102
121
|
if self.config.extract_tables:
|
122
|
+
# GMFT is optional dependency
|
103
123
|
try:
|
104
124
|
from kreuzberg._gmft import extract_tables_sync
|
105
125
|
|
106
126
|
tables = extract_tables_sync(path)
|
107
127
|
except ImportError:
|
108
|
-
|
128
|
+
tables = []
|
129
|
+
|
130
|
+
# Use playa for better text structure preservation when not using OCR
|
131
|
+
if not self.config.force_ocr and self._validate_extracted_text(text):
|
132
|
+
text = self._extract_with_playa_sync(path, fallback_text=text)
|
109
133
|
|
110
134
|
text = normalize_spaces(text)
|
111
135
|
|
112
|
-
|
136
|
+
result = ExtractionResult(
|
113
137
|
content=text,
|
114
138
|
mime_type=PLAIN_TEXT_MIME_TYPE,
|
115
139
|
metadata={},
|
@@ -117,6 +141,21 @@ class PDFExtractor(Extractor):
|
|
117
141
|
chunks=[],
|
118
142
|
)
|
119
143
|
|
144
|
+
# Enhance metadata with table information
|
145
|
+
if tables:
|
146
|
+
table_summary = generate_table_summary(tables)
|
147
|
+
result.metadata.update(
|
148
|
+
{
|
149
|
+
"table_count": table_summary["table_count"],
|
150
|
+
"tables_summary": f"Document contains {table_summary['table_count']} tables "
|
151
|
+
f"across {table_summary['pages_with_tables']} pages with "
|
152
|
+
f"{table_summary['total_rows']} total rows",
|
153
|
+
}
|
154
|
+
)
|
155
|
+
|
156
|
+
# Apply quality processing
|
157
|
+
return self._apply_quality_processing(result)
|
158
|
+
|
120
159
|
def _validate_extracted_text(self, text: str, corruption_threshold: float = 0.05) -> bool:
|
121
160
|
"""Check if text extracted from PDF is valid or corrupted.
|
122
161
|
|
@@ -155,8 +194,6 @@ class PDFExtractor(Extractor):
|
|
155
194
|
Returns:
|
156
195
|
A list of Pillow Images.
|
157
196
|
"""
|
158
|
-
from kreuzberg._utils._errors import create_error_context, should_retry
|
159
|
-
|
160
197
|
document: pypdfium2.PdfDocument | None = None
|
161
198
|
last_error = None
|
162
199
|
|
@@ -228,8 +265,6 @@ class PDFExtractor(Extractor):
|
|
228
265
|
Returns:
|
229
266
|
The extracted text.
|
230
267
|
"""
|
231
|
-
from kreuzberg._utils._errors import create_error_context
|
232
|
-
|
233
268
|
document: pypdfium2.PdfDocument | None = None
|
234
269
|
try:
|
235
270
|
with pypdfium_file_lock(input_file):
|
@@ -283,7 +318,7 @@ class PDFExtractor(Extractor):
|
|
283
318
|
text_parts = []
|
284
319
|
for page in pdf:
|
285
320
|
text_page = page.get_textpage()
|
286
|
-
text = text_page.
|
321
|
+
text = text_page.get_text_bounded()
|
287
322
|
text_parts.append(text)
|
288
323
|
text_page.close()
|
289
324
|
page.close()
|
@@ -309,9 +344,6 @@ class PDFExtractor(Extractor):
|
|
309
344
|
bitmap.close()
|
310
345
|
page.close()
|
311
346
|
|
312
|
-
import os
|
313
|
-
import tempfile
|
314
|
-
|
315
347
|
image_paths = []
|
316
348
|
temp_files = []
|
317
349
|
|
@@ -339,43 +371,44 @@ class PDFExtractor(Extractor):
|
|
339
371
|
|
340
372
|
def _process_pdf_images_with_ocr(self, image_paths: list[str]) -> str:
|
341
373
|
"""Process PDF images with the configured OCR backend."""
|
342
|
-
|
343
|
-
|
344
|
-
from kreuzberg._ocr._tesseract import TesseractConfig
|
374
|
+
backend = get_ocr_backend(self.config.ocr_backend)
|
375
|
+
paths = [Path(p) for p in image_paths]
|
345
376
|
|
346
|
-
|
377
|
+
if self.config.ocr_backend == "tesseract":
|
378
|
+
config = (
|
347
379
|
self.config.ocr_config if isinstance(self.config.ocr_config, TesseractConfig) else TesseractConfig()
|
348
380
|
)
|
349
|
-
results =
|
350
|
-
|
351
|
-
return "\n\n".join(text_parts)
|
352
|
-
|
353
|
-
if self.config.ocr_backend == "paddleocr":
|
354
|
-
from kreuzberg._multiprocessing.sync_paddleocr import process_image_sync_pure as paddle_process
|
355
|
-
from kreuzberg._ocr._paddleocr import PaddleOCRConfig
|
356
|
-
|
381
|
+
results = backend.process_batch_sync(paths, **config.__dict__)
|
382
|
+
elif self.config.ocr_backend == "paddleocr":
|
357
383
|
paddle_config = (
|
358
384
|
self.config.ocr_config if isinstance(self.config.ocr_config, PaddleOCRConfig) else PaddleOCRConfig()
|
359
385
|
)
|
360
|
-
|
361
|
-
|
362
|
-
for image_path in image_paths:
|
363
|
-
result = paddle_process(Path(image_path), paddle_config)
|
364
|
-
text_parts.append(result.content)
|
365
|
-
return "\n\n".join(text_parts)
|
366
|
-
|
367
|
-
if self.config.ocr_backend == "easyocr":
|
368
|
-
from kreuzberg._multiprocessing.sync_easyocr import process_image_sync_pure as easy_process
|
369
|
-
from kreuzberg._ocr._easyocr import EasyOCRConfig
|
370
|
-
|
386
|
+
results = backend.process_batch_sync(paths, **paddle_config.__dict__)
|
387
|
+
elif self.config.ocr_backend == "easyocr":
|
371
388
|
easy_config = (
|
372
389
|
self.config.ocr_config if isinstance(self.config.ocr_config, EasyOCRConfig) else EasyOCRConfig()
|
373
390
|
)
|
391
|
+
results = backend.process_batch_sync(paths, **easy_config.__dict__)
|
392
|
+
else:
|
393
|
+
raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
|
394
|
+
|
395
|
+
text_parts = [r.content for r in results]
|
396
|
+
return "\n\n".join(text_parts)
|
397
|
+
|
398
|
+
def _extract_with_playa_sync(self, path: Path, fallback_text: str) -> str:
|
399
|
+
"""Extract text using playa for better structure preservation."""
|
400
|
+
with contextlib.suppress(Exception):
|
401
|
+
content = path.read_bytes()
|
402
|
+
document = parse(content, max_workers=1)
|
374
403
|
|
375
404
|
text_parts = []
|
376
|
-
for
|
377
|
-
|
378
|
-
|
379
|
-
|
405
|
+
for page in document.pages:
|
406
|
+
# Extract text while preserving structure
|
407
|
+
page_text = page.extract_text()
|
408
|
+
if page_text and page_text.strip():
|
409
|
+
text_parts.append(page_text)
|
410
|
+
|
411
|
+
if text_parts:
|
412
|
+
return "\n\n".join(text_parts)
|
380
413
|
|
381
|
-
|
414
|
+
return fallback_text
|